# MVA Midterm Exam
# Name: Sarjak Maniar
# Email: sm2732@scarletmail.rutgers.edu
library(factoextra)
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(corrplot)
## corrplot 0.92 loaded
library(cluster)
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(devtools)
## Loading required package: usethis
library(readr)
library(psych)
# The accompanied dataset gives estimates of the average protein consumption in grams per
# person per day) from different food sources for the inhabitants of 25 European countries.
## Importing Data and initial analysis
## Importing csv file
Protein_Consumption <- read.csv("/Users/sarju/Desktop/MITA Sem 2/MVA/MVA_Midterm/Protein_Consumption.csv")
Protein_Consumption
## Country Red.Meat White.Meat Egg Milk Fish Cereals Starchy.Foods
## 1 Albania 10 1 1 9 0 42 1
## 2 Austria 9 14 4 20 2 28 4
## 3 Belgium 14 9 4 18 5 27 6
## 4 Bulgaria 8 6 2 8 1 57 1
## 5 Czechoslovakia 10 11 3 13 2 34 5
## 6 Denmark 11 11 4 25 10 22 5
## 7 East Germany 8 12 4 11 5 25 7
## 8 Finland 10 5 3 34 6 26 5
## 9 France 18 10 3 20 6 28 5
## 10 Greece 10 3 3 18 6 42 2
## 11 Hungary 5 12 3 10 0 40 4
## 12 Ireland 14 10 5 26 2 24 6
## 13 Italy 9 5 3 14 3 37 2
## 14 Netherlands 10 14 4 23 3 22 4
## 15 Norway 9 5 3 23 10 23 5
## 16 Poland 7 10 3 19 3 36 6
## 17 Portugal 6 4 1 5 14 27 6
## 18 Romania 6 6 2 11 1 50 3
## 19 Spain 7 3 3 9 7 29 6
## 20 Sweden 10 8 4 25 8 20 4
## 21 Switzerland 13 10 3 24 2 26 3
## 22 United Kingdom 17 6 5 21 4 24 5
## 23 USSR 9 5 2 17 3 44 6
## 24 West Germany 11 13 4 19 3 19 5
## 25 Yugoslavia 4 5 1 10 1 56 3
## Pulses.Nuts.and.Oilseeds Fruits.and.Vegetables Total
## 1 6 2 72
## 2 1 4 86
## 3 2 4 89
## 4 4 4 91
## 5 1 4 83
## 6 1 2 91
## 7 1 4 77
## 8 1 1 91
## 9 2 7 99
## 10 8 7 99
## 11 5 4 83
## 12 2 3 92
## 13 4 7 84
## 14 2 4 86
## 15 2 3 83
## 16 2 7 93
## 17 5 8 76
## 18 5 3 87
## 19 6 7 77
## 20 1 2 82
## 21 2 5 88
## 22 3 3 88
## 23 3 3 92
## 24 2 4 80
## 25 6 3 89
Protein_Consumption <- as.data.frame(Protein_Consumption)
Protein_Consumption
## Country Red.Meat White.Meat Egg Milk Fish Cereals Starchy.Foods
## 1 Albania 10 1 1 9 0 42 1
## 2 Austria 9 14 4 20 2 28 4
## 3 Belgium 14 9 4 18 5 27 6
## 4 Bulgaria 8 6 2 8 1 57 1
## 5 Czechoslovakia 10 11 3 13 2 34 5
## 6 Denmark 11 11 4 25 10 22 5
## 7 East Germany 8 12 4 11 5 25 7
## 8 Finland 10 5 3 34 6 26 5
## 9 France 18 10 3 20 6 28 5
## 10 Greece 10 3 3 18 6 42 2
## 11 Hungary 5 12 3 10 0 40 4
## 12 Ireland 14 10 5 26 2 24 6
## 13 Italy 9 5 3 14 3 37 2
## 14 Netherlands 10 14 4 23 3 22 4
## 15 Norway 9 5 3 23 10 23 5
## 16 Poland 7 10 3 19 3 36 6
## 17 Portugal 6 4 1 5 14 27 6
## 18 Romania 6 6 2 11 1 50 3
## 19 Spain 7 3 3 9 7 29 6
## 20 Sweden 10 8 4 25 8 20 4
## 21 Switzerland 13 10 3 24 2 26 3
## 22 United Kingdom 17 6 5 21 4 24 5
## 23 USSR 9 5 2 17 3 44 6
## 24 West Germany 11 13 4 19 3 19 5
## 25 Yugoslavia 4 5 1 10 1 56 3
## Pulses.Nuts.and.Oilseeds Fruits.and.Vegetables Total
## 1 6 2 72
## 2 1 4 86
## 3 2 4 89
## 4 4 4 91
## 5 1 4 83
## 6 1 2 91
## 7 1 4 77
## 8 1 1 91
## 9 2 7 99
## 10 8 7 99
## 11 5 4 83
## 12 2 3 92
## 13 4 7 84
## 14 2 4 86
## 15 2 3 83
## 16 2 7 93
## 17 5 8 76
## 18 5 3 87
## 19 6 7 77
## 20 1 2 82
## 21 2 5 88
## 22 3 3 88
## 23 3 3 92
## 24 2 4 80
## 25 6 3 89
#Dimension of the dataset
dim(Protein_Consumption) # 25 11 [There are 25 rows and 11 columns]
## [1] 25 11
colnames(Protein_Consumption) # printing all the column names
## [1] "Country" "Red.Meat"
## [3] "White.Meat" "Egg"
## [5] "Milk" "Fish"
## [7] "Cereals" "Starchy.Foods"
## [9] "Pulses.Nuts.and.Oilseeds" "Fruits.and.Vegetables"
## [11] "Total"
attach(Protein_Consumption)
# Printing first 5 rows of the dataset
head(Protein_Consumption)
## Country Red.Meat White.Meat Egg Milk Fish Cereals Starchy.Foods
## 1 Albania 10 1 1 9 0 42 1
## 2 Austria 9 14 4 20 2 28 4
## 3 Belgium 14 9 4 18 5 27 6
## 4 Bulgaria 8 6 2 8 1 57 1
## 5 Czechoslovakia 10 11 3 13 2 34 5
## 6 Denmark 11 11 4 25 10 22 5
## Pulses.Nuts.and.Oilseeds Fruits.and.Vegetables Total
## 1 6 2 72
## 2 1 4 86
## 3 2 4 89
## 4 4 4 91
## 5 1 4 83
## 6 1 2 91
# Printing the summary of the dataset
summary(Protein_Consumption)
## Country Red.Meat White.Meat Egg
## Length:25 Min. : 4.0 Min. : 1.00 Min. :1.00
## Class :character 1st Qu.: 8.0 1st Qu.: 5.00 1st Qu.:3.00
## Mode :character Median :10.0 Median : 8.00 Median :3.00
## Mean : 9.8 Mean : 7.92 Mean :3.08
## 3rd Qu.:11.0 3rd Qu.:11.00 3rd Qu.:4.00
## Max. :18.0 Max. :14.00 Max. :5.00
## Milk Fish Cereals Starchy.Foods
## Min. : 5.00 Min. : 0.00 Min. :19.00 Min. :1.00
## 1st Qu.:11.00 1st Qu.: 2.00 1st Qu.:24.00 1st Qu.:3.00
## Median :18.00 Median : 3.00 Median :28.00 Median :5.00
## Mean :17.28 Mean : 4.28 Mean :32.32 Mean :4.36
## 3rd Qu.:23.00 3rd Qu.: 6.00 3rd Qu.:40.00 3rd Qu.:6.00
## Max. :34.00 Max. :14.00 Max. :57.00 Max. :7.00
## Pulses.Nuts.and.Oilseeds Fruits.and.Vegetables Total
## Min. :1.00 Min. :1.0 Min. :72.00
## 1st Qu.:2.00 1st Qu.:3.0 1st Qu.:83.00
## Median :2.00 Median :4.0 Median :87.00
## Mean :3.08 Mean :4.2 Mean :86.32
## 3rd Qu.:5.00 3rd Qu.:5.0 3rd Qu.:91.00
## Max. :8.00 Max. :8.0 Max. :99.00
# Printing the structure of the dataset
str(Protein_Consumption)
## 'data.frame': 25 obs. of 11 variables:
## $ Country : chr "Albania" "Austria" "Belgium" "Bulgaria" ...
## $ Red.Meat : int 10 9 14 8 10 11 8 10 18 10 ...
## $ White.Meat : int 1 14 9 6 11 11 12 5 10 3 ...
## $ Egg : int 1 4 4 2 3 4 4 3 3 3 ...
## $ Milk : int 9 20 18 8 13 25 11 34 20 18 ...
## $ Fish : int 0 2 5 1 2 10 5 6 6 6 ...
## $ Cereals : int 42 28 27 57 34 22 25 26 28 42 ...
## $ Starchy.Foods : int 1 4 6 1 5 5 7 5 5 2 ...
## $ Pulses.Nuts.and.Oilseeds: int 6 1 2 4 1 1 1 1 2 8 ...
## $ Fruits.and.Vegetables : int 2 4 4 4 4 2 4 1 7 7 ...
## $ Total : int 72 86 89 91 83 91 77 91 99 99 ...
# As we can see that all the columns are of type int except the column Country, which is of type chr
# =============================================================================================
# Question 1 - Use principal components analysis to investigate the relationships between the
# countries on the basis of these variables
# Getting the Correlations
cor(Protein_Consumption[-1])
## Red.Meat White.Meat Egg Milk
## Red.Meat 1.00000000 0.18850977 0.57532001 0.5440251
## White.Meat 0.18850977 1.00000000 0.60095535 0.2974816
## Egg 0.57532001 0.60095535 1.00000000 0.6130310
## Milk 0.54402512 0.29748163 0.61303102 1.0000000
## Fish 0.06491072 -0.19719960 0.04780844 0.1624624
## Cereals -0.50970337 -0.43941908 -0.70131040 -0.5924925
## Starchy.Foods 0.15383673 0.33456770 0.41266333 0.2144917
## Pulses.Nuts.and.Oilseeds -0.40988882 -0.67214885 -0.59519381 -0.6238357
## Fruits.and.Vegetables -0.06393465 -0.07329308 -0.16392249 -0.3997753
## Total 0.37369919 0.10308602 0.18970028 0.4603542
## Fish Cereals Starchy.Foods
## Red.Meat 0.06491072 -0.50970337 0.15383673
## White.Meat -0.19719960 -0.43941908 0.33456770
## Egg 0.04780844 -0.70131040 0.41266333
## Milk 0.16246239 -0.59249246 0.21449173
## Fish 1.00000000 -0.51714759 0.43868411
## Cereals -0.51714759 1.00000000 -0.57813449
## Starchy.Foods 0.43868411 -0.57813449 1.00000000
## Pulses.Nuts.and.Oilseeds -0.12226043 0.63605948 -0.49518800
## Fruits.and.Vegetables 0.22948842 0.04229293 0.06835670
## Total -0.09089592 0.18587578 -0.04418245
## Pulses.Nuts.and.Oilseeds Fruits.and.Vegetables
## Red.Meat -0.4098888 -0.06393465
## White.Meat -0.6721488 -0.07329308
## Egg -0.5951938 -0.16392249
## Milk -0.6238357 -0.39977527
## Fish -0.1222604 0.22948842
## Cereals 0.6360595 0.04229293
## Starchy.Foods -0.4951880 0.06835670
## Pulses.Nuts.and.Oilseeds 1.0000000 0.35133227
## Fruits.and.Vegetables 0.3513323 1.00000000
## Total -0.0812251 0.07201466
## Total
## Red.Meat 0.37369919
## White.Meat 0.10308602
## Egg 0.18970028
## Milk 0.46035417
## Fish -0.09089592
## Cereals 0.18587578
## Starchy.Foods -0.04418245
## Pulses.Nuts.and.Oilseeds -0.08122510
## Fruits.and.Vegetables 0.07201466
## Total 1.00000000
# Plotting correlation plot to understand the most correlated variables
Protein_Consumption_plot <- cor(Protein_Consumption[-1])
corrplot(Protein_Consumption_plot, method="circle")

# Inferences:
# We can see from the correlation plot, that Cereal and Eggs are negatively correlated.
# Also, White Meat and Eggs are positively correlated.
# Finding the principal components of data
# Using prcomp to compute the principal components (eigenvalues and eigenvectors).
# With scale=TRUE, variable means are set to zero, and variances set to one
Protein_Consumption_pca <- prcomp(Protein_Consumption[,-1],scale=TRUE)
#Scaling to Standardize the data values
Protein_Consumption_pca
## Standard deviations (1, .., p=10):
## [1] 2.032257e+00 1.319067e+00 1.144237e+00 1.021544e+00 8.360847e-01
## [6] 6.531975e-01 5.841454e-01 4.366348e-01 3.458098e-01 6.618503e-16
##
## Rotation (n x k) = (10 x 10):
## PC1 PC2 PC3 PC4
## Red.Meat -0.3180769 -0.17809245 -0.38142753 -0.039766137
## White.Meat -0.3140588 -0.11783853 0.36420271 0.538507972
## Egg -0.4202281 -0.08236350 0.02047575 0.155623651
## Milk -0.3870300 -0.23356182 -0.19997405 -0.320360929
## Fish -0.1271598 0.57388821 -0.33003267 -0.304161366
## Cereals 0.4177240 -0.31321549 -0.02354236 0.104798477
## Starchy.Foods -0.2880798 0.41038324 0.05768490 0.150709175
## Pulses.Nuts.and.Oilseeds 0.4177658 0.04145202 -0.24796403 0.008042093
## Fruits.and.Vegetables 0.1197680 0.34858202 -0.41210384 0.643455476
## Total -0.1062294 -0.41709540 -0.58081103 0.203145847
## PC5 PC6 PC7 PC8
## Red.Meat 0.53138781 -0.393811788 0.42940825 -0.1592276
## White.Meat -0.09760147 0.309417061 0.09254681 -0.2919567
## Egg 0.26932734 -0.059357751 -0.63995627 -0.2652806
## Milk -0.15848975 0.307976584 -0.17405921 0.5444724
## Fish -0.20323386 0.303075844 0.06315829 -0.5200308
## Cereals -0.29201244 -0.196460437 0.06971238 -0.2001491
## Starchy.Foods -0.42198545 -0.680457657 -0.11769041 0.1889672
## Pulses.Nuts.and.Oilseeds 0.22507285 -0.087921207 -0.57816932 -0.0829400
## Fruits.and.Vegetables 0.16834367 0.222568384 0.08684392 0.3701826
## Total -0.47623561 -0.007702046 -0.05178373 -0.1801923
## PC9 PC10
## Red.Meat -0.17150487 0.20838019
## White.Meat -0.46186736 0.22903415
## Egg 0.48098579 0.06827056
## Milk -0.13218960 0.43456461
## Fish 0.01789764 0.21247753
## Cereals 0.30436394 0.67412235
## Starchy.Foods -0.14706957 0.10134794
## Pulses.Nuts.and.Oilseeds -0.58938418 0.12362100
## Fruits.and.Vegetables 0.20995988 0.11723988
## Total -0.04898111 -0.41440004
# Sample scores -> stored in Protein_Consumption_pca$x
Protein_Consumption_pca$x # returns the sample scores for each country along each principal component.
## PC1 PC2 PC3 PC4 PC5 PC6
## [1,] 3.5978397 -0.64061101 1.1118946 -1.91119245 1.884437106 -0.37593345
## [2,] -1.3862854 -0.70991905 1.1613381 0.93107494 -0.009121937 0.75816906
## [3,] -1.6608482 0.10781730 -0.4231894 0.24680766 0.188016546 -0.91001548
## [4,] 2.9881523 -1.84361307 -0.0730564 0.30616165 -0.134812297 0.29005421
## [5,] -0.3686147 -0.10141825 1.2155042 0.72202089 -0.062918010 -0.37091750
## [6,] -2.4923551 0.18474749 -0.2075253 -0.93906831 -0.822177041 0.65204948
## [7,] -1.2387459 1.58140979 1.9302394 0.77259151 -0.139755937 -0.58954056
## [8,] -1.7732789 -0.75352175 -0.3644876 -2.28429396 -1.224019848 0.17828822
## [9,] -1.6448018 -0.30606640 -2.4846910 1.25325810 0.230223125 -0.33223855
## [10,] 2.0943234 -0.61997417 -3.0846378 0.31332068 0.270784604 0.64981699
## [11,] 1.4808993 -0.43978564 1.6090270 1.21709297 -0.143865961 0.11534733
## [12,] -2.6714332 -1.03848419 -0.2833724 0.15763312 0.181076517 -0.86151844
## [13,] 1.5660043 -0.01064018 -0.5907111 0.54266246 1.069631810 0.77586008
## [14,] -1.7006997 -0.50438298 0.7596605 0.64321026 0.292062273 0.92348043
## [15,] -0.8828201 1.28521025 -0.1832152 -1.71931314 -0.439007528 0.41757899
## [16,] -0.2286613 0.19642466 -0.4058046 1.67696384 -1.334150980 0.08818598
## [17,] 2.0912590 4.41252506 -0.6718598 -0.03434506 -0.291193444 0.33278906
## [18,] 2.6049767 -1.05771521 0.5868844 -0.14252039 -0.533268313 -0.20083289
## [19,] 1.5709389 2.67472726 -0.2892457 0.23912301 0.594881631 -0.60647031
## [20,] -1.8343339 0.36443676 0.5444138 -1.56417414 0.158327086 0.80195706
## [21,] -0.9293183 -0.96269089 -0.3476755 0.27836268 0.755554148 0.70844461
## [22,] -1.9728952 -0.55508144 -0.8727628 -0.60997694 1.396218668 -1.20971357
## [23,] 0.7660628 -0.48463412 -0.2720099 -0.40950179 -1.470304012 -1.24044252
## [24,] -1.6857673 0.30943116 1.2190705 0.55052071 0.810416131 0.20076819
## [25,] 3.7104025 -1.08819138 0.4162119 -0.23641829 -1.227034337 -0.19516642
## PC7 PC8 PC9 PC10
## [1,] 0.6467777066 0.308209567 -0.344610598 -7.771561e-16
## [2,] 0.0005093868 -0.012933034 0.124176638 -9.471590e-16
## [3,] 0.1534640851 -0.334041295 0.023323758 -3.330669e-16
## [4,] 0.5999541449 -0.762640350 0.674235551 1.665335e-16
## [5,] 0.7878924305 -0.039689570 0.241927022 -6.661338e-16
## [6,] -0.0364433564 -0.984127670 -0.168254146 -5.551115e-17
## [7,] -0.0632650200 -0.313388346 0.320254182 -8.881784e-16
## [8,] -0.0506617637 0.792618282 0.004268287 -6.661338e-16
## [9,] 1.3629405718 -0.176345585 -0.392094989 -4.440892e-16
## [10,] -1.1867279230 -0.252605939 -0.185325024 -4.440892e-16
## [11,] -0.8173673169 -0.201792286 -0.496946360 -9.714451e-16
## [12,] -0.7338089555 0.194588527 -0.047542669 -5.551115e-16
## [13,] 0.0085984337 0.435335074 0.815121519 -8.326673e-16
## [14,] -0.2530352518 0.088559649 -0.434700410 -1.051242e-15
## [15,] 0.0122896156 0.009259812 0.182509788 -4.718448e-16
## [16,] -0.0295375727 0.839590880 0.341088667 -7.771561e-16
## [17,] 0.6466024099 -0.205548666 -0.304794550 -1.110223e-15
## [18,] -0.2135771460 -0.211277632 -0.024663621 -3.677614e-16
## [19,] -0.9520057576 0.408309790 0.166895175 -1.221245e-15
## [20,] -0.1459371778 -0.241698086 0.340921291 -6.106227e-16
## [21,] 0.6841927749 0.678069260 -0.252544924 -1.040834e-15
## [22,] -0.4798955917 -0.365578790 0.224480622 -2.636780e-16
## [23,] 0.3126133335 0.287576242 -0.038575860 -1.110223e-16
## [24,] -0.0977006735 0.141407912 -0.415483582 -1.110223e-15
## [25,] -0.1558713867 -0.081857747 -0.353665768 -4.718448e-16
# Sample scores (also called factor scores) are the values of each observation
# (in this case, each country) along each principal component (PC).
# Sample scores basically tells how much each observation contributes to each PC
# Singular values (square roots of eigenvalues) -> stored in Protein_Consumption_pca$sdev
Protein_Consumption_pca$sdev
## [1] 2.032257e+00 1.319067e+00 1.144237e+00 1.021544e+00 8.360847e-01
## [6] 6.531975e-01 5.841454e-01 4.366348e-01 3.458098e-01 6.618503e-16
# Loadings (eigenvectors) -> stored in Protein_Consumption_pca$rotation
Protein_Consumption_pca$rotation # provides the matrix of loadings or principal component coefficients.
## PC1 PC2 PC3 PC4
## Red.Meat -0.3180769 -0.17809245 -0.38142753 -0.039766137
## White.Meat -0.3140588 -0.11783853 0.36420271 0.538507972
## Egg -0.4202281 -0.08236350 0.02047575 0.155623651
## Milk -0.3870300 -0.23356182 -0.19997405 -0.320360929
## Fish -0.1271598 0.57388821 -0.33003267 -0.304161366
## Cereals 0.4177240 -0.31321549 -0.02354236 0.104798477
## Starchy.Foods -0.2880798 0.41038324 0.05768490 0.150709175
## Pulses.Nuts.and.Oilseeds 0.4177658 0.04145202 -0.24796403 0.008042093
## Fruits.and.Vegetables 0.1197680 0.34858202 -0.41210384 0.643455476
## Total -0.1062294 -0.41709540 -0.58081103 0.203145847
## PC5 PC6 PC7 PC8
## Red.Meat 0.53138781 -0.393811788 0.42940825 -0.1592276
## White.Meat -0.09760147 0.309417061 0.09254681 -0.2919567
## Egg 0.26932734 -0.059357751 -0.63995627 -0.2652806
## Milk -0.15848975 0.307976584 -0.17405921 0.5444724
## Fish -0.20323386 0.303075844 0.06315829 -0.5200308
## Cereals -0.29201244 -0.196460437 0.06971238 -0.2001491
## Starchy.Foods -0.42198545 -0.680457657 -0.11769041 0.1889672
## Pulses.Nuts.and.Oilseeds 0.22507285 -0.087921207 -0.57816932 -0.0829400
## Fruits.and.Vegetables 0.16834367 0.222568384 0.08684392 0.3701826
## Total -0.47623561 -0.007702046 -0.05178373 -0.1801923
## PC9 PC10
## Red.Meat -0.17150487 0.20838019
## White.Meat -0.46186736 0.22903415
## Egg 0.48098579 0.06827056
## Milk -0.13218960 0.43456461
## Fish 0.01789764 0.21247753
## Cereals 0.30436394 0.67412235
## Starchy.Foods -0.14706957 0.10134794
## Pulses.Nuts.and.Oilseeds -0.58938418 0.12362100
## Fruits.and.Vegetables 0.20995988 0.11723988
## Total -0.04898111 -0.41440004
# Variable means -> stored in Protein_Consumption_pca$center
Protein_Consumption_pca$center
## Red.Meat White.Meat Egg
## 9.80 7.92 3.08
## Milk Fish Cereals
## 17.28 4.28 32.32
## Starchy.Foods Pulses.Nuts.and.Oilseeds Fruits.and.Vegetables
## 4.36 3.08 4.20
## Total
## 86.32
# Variable standard deviations -> stored in Protein_Consumption_pca$scale
Protein_Consumption_pca$scale
## Red.Meat White.Meat Egg
## 3.403430 3.740766 1.115049
## Milk Fish Cereals
## 7.097652 3.470351 11.010298
## Starchy.Foods Pulses.Nuts.and.Oilseeds Fruits.and.Vegetables
## 1.655295 2.019076 1.914854
## Total
## 6.768309
# A table containing eigenvalues and %'s accounted, follows
# Eigenvalues are sdev^2
#Extract variance against features
eigenvalues<-Protein_Consumption_pca$sdev^2
eigenvalues
## [1] 4.130067e+00 1.739939e+00 1.309278e+00 1.043551e+00 6.990377e-01
## [6] 4.266669e-01 3.412258e-01 1.906500e-01 1.195844e-01 4.380459e-31
sum(eigenvalues)
## [1] 10
names(eigenvalues) <- paste("PC",1:10,sep="")
eigenvalues
## PC1 PC2 PC3 PC4 PC5 PC6
## 4.130067e+00 1.739939e+00 1.309278e+00 1.043551e+00 6.990377e-01 4.266669e-01
## PC7 PC8 PC9 PC10
## 3.412258e-01 1.906500e-01 1.195844e-01 4.380459e-31
sumoflambdas <- sum(eigenvalues)
sumoflambdas
## [1] 10
#Variance %
bfcvar<- (eigenvalues/sumoflambdas)*100
round(bfcvar,10)
## PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8
## 41.300672 17.399386 13.092782 10.435513 6.990377 4.266669 3.412258 1.906500
## PC9 PC10
## 1.195844 0.000000
barplot(bfcvar,main="Scree plot",xlab="Principal Component",ylab="Percent Variation")

#Calculate cumulative of variance
cumvar <- cumsum(bfcvar)
cumvar
## PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8
## 41.30067 58.70006 71.79284 82.22835 89.21873 93.48540 96.89766 98.80416
## PC9 PC10
## 100.00000 100.00000
matlambdas <- rbind(eigenvalues,bfcvar,cumvar)
round(matlambdas,10)
## PC1 PC2 PC3 PC4 PC5 PC6
## eigenvalues 4.130067 1.739939 1.309278 1.043551 0.6990377 0.4266669
## bfcvar 41.300672 17.399386 13.092782 10.435513 6.9903765 4.2666693
## cumvar 41.300672 58.700057 71.792839 82.228352 89.2187289 93.4853982
## PC7 PC8 PC9 PC10
## eigenvalues 0.3412258 0.19065 0.1195844 0
## bfcvar 3.4122581 1.90650 1.1958440 0
## cumvar 96.8976563 98.80416 100.0000000 100
eigenvec_Protein_Consumption <- Protein_Consumption_pca$rotation
eigenvec_Protein_Consumption
## PC1 PC2 PC3 PC4
## Red.Meat -0.3180769 -0.17809245 -0.38142753 -0.039766137
## White.Meat -0.3140588 -0.11783853 0.36420271 0.538507972
## Egg -0.4202281 -0.08236350 0.02047575 0.155623651
## Milk -0.3870300 -0.23356182 -0.19997405 -0.320360929
## Fish -0.1271598 0.57388821 -0.33003267 -0.304161366
## Cereals 0.4177240 -0.31321549 -0.02354236 0.104798477
## Starchy.Foods -0.2880798 0.41038324 0.05768490 0.150709175
## Pulses.Nuts.and.Oilseeds 0.4177658 0.04145202 -0.24796403 0.008042093
## Fruits.and.Vegetables 0.1197680 0.34858202 -0.41210384 0.643455476
## Total -0.1062294 -0.41709540 -0.58081103 0.203145847
## PC5 PC6 PC7 PC8
## Red.Meat 0.53138781 -0.393811788 0.42940825 -0.1592276
## White.Meat -0.09760147 0.309417061 0.09254681 -0.2919567
## Egg 0.26932734 -0.059357751 -0.63995627 -0.2652806
## Milk -0.15848975 0.307976584 -0.17405921 0.5444724
## Fish -0.20323386 0.303075844 0.06315829 -0.5200308
## Cereals -0.29201244 -0.196460437 0.06971238 -0.2001491
## Starchy.Foods -0.42198545 -0.680457657 -0.11769041 0.1889672
## Pulses.Nuts.and.Oilseeds 0.22507285 -0.087921207 -0.57816932 -0.0829400
## Fruits.and.Vegetables 0.16834367 0.222568384 0.08684392 0.3701826
## Total -0.47623561 -0.007702046 -0.05178373 -0.1801923
## PC9 PC10
## Red.Meat -0.17150487 0.20838019
## White.Meat -0.46186736 0.22903415
## Egg 0.48098579 0.06827056
## Milk -0.13218960 0.43456461
## Fish 0.01789764 0.21247753
## Cereals 0.30436394 0.67412235
## Starchy.Foods -0.14706957 0.10134794
## Pulses.Nuts.and.Oilseeds -0.58938418 0.12362100
## Fruits.and.Vegetables 0.20995988 0.11723988
## Total -0.04898111 -0.41440004
# Relation between the variables w.r.t countries
Protein_Consumption_env <- cbind(data.frame(Country),Protein_Consumption_pca$x)
Protein_Consumption_env
## Country PC1 PC2 PC3 PC4 PC5
## 1 Albania 3.5978397 -0.64061101 1.1118946 -1.91119245 1.884437106
## 2 Austria -1.3862854 -0.70991905 1.1613381 0.93107494 -0.009121937
## 3 Belgium -1.6608482 0.10781730 -0.4231894 0.24680766 0.188016546
## 4 Bulgaria 2.9881523 -1.84361307 -0.0730564 0.30616165 -0.134812297
## 5 Czechoslovakia -0.3686147 -0.10141825 1.2155042 0.72202089 -0.062918010
## 6 Denmark -2.4923551 0.18474749 -0.2075253 -0.93906831 -0.822177041
## 7 East Germany -1.2387459 1.58140979 1.9302394 0.77259151 -0.139755937
## 8 Finland -1.7732789 -0.75352175 -0.3644876 -2.28429396 -1.224019848
## 9 France -1.6448018 -0.30606640 -2.4846910 1.25325810 0.230223125
## 10 Greece 2.0943234 -0.61997417 -3.0846378 0.31332068 0.270784604
## 11 Hungary 1.4808993 -0.43978564 1.6090270 1.21709297 -0.143865961
## 12 Ireland -2.6714332 -1.03848419 -0.2833724 0.15763312 0.181076517
## 13 Italy 1.5660043 -0.01064018 -0.5907111 0.54266246 1.069631810
## 14 Netherlands -1.7006997 -0.50438298 0.7596605 0.64321026 0.292062273
## 15 Norway -0.8828201 1.28521025 -0.1832152 -1.71931314 -0.439007528
## 16 Poland -0.2286613 0.19642466 -0.4058046 1.67696384 -1.334150980
## 17 Portugal 2.0912590 4.41252506 -0.6718598 -0.03434506 -0.291193444
## 18 Romania 2.6049767 -1.05771521 0.5868844 -0.14252039 -0.533268313
## 19 Spain 1.5709389 2.67472726 -0.2892457 0.23912301 0.594881631
## 20 Sweden -1.8343339 0.36443676 0.5444138 -1.56417414 0.158327086
## 21 Switzerland -0.9293183 -0.96269089 -0.3476755 0.27836268 0.755554148
## 22 United Kingdom -1.9728952 -0.55508144 -0.8727628 -0.60997694 1.396218668
## 23 USSR 0.7660628 -0.48463412 -0.2720099 -0.40950179 -1.470304012
## 24 West Germany -1.6857673 0.30943116 1.2190705 0.55052071 0.810416131
## 25 Yugoslavia 3.7104025 -1.08819138 0.4162119 -0.23641829 -1.227034337
## PC6 PC7 PC8 PC9 PC10
## 1 -0.37593345 0.6467777066 0.308209567 -0.344610598 -7.771561e-16
## 2 0.75816906 0.0005093868 -0.012933034 0.124176638 -9.471590e-16
## 3 -0.91001548 0.1534640851 -0.334041295 0.023323758 -3.330669e-16
## 4 0.29005421 0.5999541449 -0.762640350 0.674235551 1.665335e-16
## 5 -0.37091750 0.7878924305 -0.039689570 0.241927022 -6.661338e-16
## 6 0.65204948 -0.0364433564 -0.984127670 -0.168254146 -5.551115e-17
## 7 -0.58954056 -0.0632650200 -0.313388346 0.320254182 -8.881784e-16
## 8 0.17828822 -0.0506617637 0.792618282 0.004268287 -6.661338e-16
## 9 -0.33223855 1.3629405718 -0.176345585 -0.392094989 -4.440892e-16
## 10 0.64981699 -1.1867279230 -0.252605939 -0.185325024 -4.440892e-16
## 11 0.11534733 -0.8173673169 -0.201792286 -0.496946360 -9.714451e-16
## 12 -0.86151844 -0.7338089555 0.194588527 -0.047542669 -5.551115e-16
## 13 0.77586008 0.0085984337 0.435335074 0.815121519 -8.326673e-16
## 14 0.92348043 -0.2530352518 0.088559649 -0.434700410 -1.051242e-15
## 15 0.41757899 0.0122896156 0.009259812 0.182509788 -4.718448e-16
## 16 0.08818598 -0.0295375727 0.839590880 0.341088667 -7.771561e-16
## 17 0.33278906 0.6466024099 -0.205548666 -0.304794550 -1.110223e-15
## 18 -0.20083289 -0.2135771460 -0.211277632 -0.024663621 -3.677614e-16
## 19 -0.60647031 -0.9520057576 0.408309790 0.166895175 -1.221245e-15
## 20 0.80195706 -0.1459371778 -0.241698086 0.340921291 -6.106227e-16
## 21 0.70844461 0.6841927749 0.678069260 -0.252544924 -1.040834e-15
## 22 -1.20971357 -0.4798955917 -0.365578790 0.224480622 -2.636780e-16
## 23 -1.24044252 0.3126133335 0.287576242 -0.038575860 -1.110223e-16
## 24 0.20076819 -0.0977006735 0.141407912 -0.415483582 -1.110223e-15
## 25 -0.19516642 -0.1558713867 -0.081857747 -0.353665768 -4.718448e-16
#Visualize PCA using Scree plot
screeplot(Protein_Consumption_pca, type='bar',main='Scree plot')

summary(Protein_Consumption_pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8
## Standard deviation 2.032 1.319 1.1442 1.0215 0.8361 0.65320 0.58415 0.43663
## Proportion of Variance 0.413 0.174 0.1309 0.1044 0.0699 0.04267 0.03412 0.01906
## Cumulative Proportion 0.413 0.587 0.7179 0.8223 0.8922 0.93485 0.96898 0.98804
## PC9 PC10
## Standard deviation 0.34581 6.619e-16
## Proportion of Variance 0.01196 0.000e+00
## Cumulative Proportion 1.00000 1.000e+00
# Principal Component Analysis is used to reduce the large set of variables to a
# small set that still contains most of the information from the large dataset and thereby,
# reducing the dimension of the dataset.
# After performing PCA on the Protein Consumption Dataset, we obtain 10 Principal components.
# Each of these components represent the percentage of variability present in the dataset.
# In other words, PC1 explains 41.3% of total variance, PC2 explains 17.4% and so on.
# We will consider the first six principal components as they sum up to 93% of total variance
# and the other four can be discarded as they contribute to only 7% of total variance.
# Better Ways to Visualize
library(factoextra)
library(FactoMineR)
library(ggfortify)
library(psych)
library(corrplot)
library(devtools)
# Correlation
pairs.panels(Protein_Consumption[,-1],
gap = 0,
bg = c("red", "blue")[Protein_Consumption$Country],
pch=21)

pairs.panels(Protein_Consumption_pca$x,
gap=0,
bg = c("red", "blue")[Protein_Consumption$Country],
pch=21)

# Variables - PCA (cos2)
fviz_eig(Protein_Consumption_pca, addlabels = TRUE)

fviz_pca_var(Protein_Consumption_pca,col.var = "cos2",
gradient.cols = c("#FFCC00", "#CC9933", "#660033", "#330033"),
repel = TRUE)

# The plot above is also known as Variable Correlation plots.
# It shows the relationships between all variables. It can be interpreted as follow:
# Positively correlated variables are grouped together.
# Negatively correlated variables are positioned on opposite sides of the plot origin (opposed quadrants).
# The distance between variables and the origin measures the quality of the variables on the factor map.
# Variables that are away from the origin are well represented on the factor map.
# Individuals - PCA (cos2)
fviz_pca_ind(Protein_Consumption_pca, col.ind = "cos2",
gradient.cols = c("#FFCC00", "#CC9933", "#660033", "#330033"),
repel = TRUE)

# Biplot
biplot(Protein_Consumption_pca)

autoplot(Protein_Consumption_pca,
data = Protein_Consumption[,-1],
loadings = TRUE,
labels = Protein_Consumption$Country)

# Different PCA Method.
res.pca <- PCA(Protein_Consumption[,-1], graph = FALSE)
print(res.pca)
## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 25 individuals, described by 10 variables
## *The results are available in the following objects:
##
## name description
## 1 "$eig" "eigenvalues"
## 2 "$var" "results for the variables"
## 3 "$var$coord" "coord. for the variables"
## 4 "$var$cor" "correlations variables - dimensions"
## 5 "$var$cos2" "cos2 for the variables"
## 6 "$var$contrib" "contributions of the variables"
## 7 "$ind" "results for the individuals"
## 8 "$ind$coord" "coord. for the individuals"
## 9 "$ind$cos2" "cos2 for the individuals"
## 10 "$ind$contrib" "contributions of the individuals"
## 11 "$call" "summary statistics"
## 12 "$call$centre" "mean of the variables"
## 13 "$call$ecart.type" "standard error of the variables"
## 14 "$call$row.w" "weights for the individuals"
## 15 "$call$col.w" "weights for the variables"
# Visualize and Interpret PCA using these functions
# get_eigenvalue(res.pca): Extract the eigenvalues/variances of principal components
# fviz_eig(res.pca): Visualize the eigenvalues
# get_pca_ind(res.pca), get_pca_var(res.pca): Extract the results for individuals and variables, respectively.
# fviz_pca_ind(res.pca), fviz_pca_var(res.pca): Visualize the results individuals and variables, respectively.
# fviz_pca_biplot(res.pca): Make a biplot of individuals and variables.
eig.val <- get_eigenvalue(res.pca)
eig.val
## eigenvalue variance.percent cumulative.variance.percent
## Dim.1 4.130067e+00 4.130067e+01 41.30067
## Dim.2 1.739939e+00 1.739939e+01 58.70006
## Dim.3 1.309278e+00 1.309278e+01 71.79284
## Dim.4 1.043551e+00 1.043551e+01 82.22835
## Dim.5 6.990377e-01 6.990377e+00 89.21873
## Dim.6 4.266669e-01 4.266669e+00 93.48540
## Dim.7 3.412258e-01 3.412258e+00 96.89766
## Dim.8 1.906500e-01 1.906500e+00 98.80416
## Dim.9 1.195844e-01 1.195844e+00 100.00000
## Dim.10 1.906537e-31 1.906537e-30 100.00000
# Scree Plot
fviz_eig(res.pca, addlabels = TRUE, ylim = c(0, 50))

# As we can see that we got the same plot with this method also.
var <- get_pca_var(res.pca)
# var$coord: coordinates of variables to create a scatter plot
# var$cos2: represents the quality of representation for variables on the factor map. It’s calculated as the squared coordinates: var.cos2 = var.coord * var.coord.
# var$contrib: contains the contributions (in percentage) of the variables to the principal components.
# The contribution of a variable (var) to a given principal component is (in percentage) : (var.cos2 * 100) / (total cos2 of the component).
var
## Principal Component Analysis Results for variables
## ===================================================
## Name Description
## 1 "$coord" "Coordinates for the variables"
## 2 "$cor" "Correlations between variables and dimensions"
## 3 "$cos2" "Cos2 for the variables"
## 4 "$contrib" "contributions of the variables"
# Coordinates
head(var$coord)
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## Red.Meat 0.6464140 -0.2349159 0.43644348 -0.04062284 -0.44428523
## White.Meat 0.6382482 -0.1554370 -0.41673420 0.55010936 0.08160309
## Egg 0.8540114 -0.1086430 -0.02342911 0.15897634 -0.22518048
## Milk 0.7865443 -0.3080838 0.22881770 -0.32726265 0.13251086
## Fish 0.2584213 0.7569972 0.37763557 -0.31071409 0.16992073
## Cereals -0.8489223 -0.4131523 0.02693804 0.10705621 0.24414713
# Cos2: quality on the factore map
head(var$cos2)
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## Red.Meat 0.41785102 0.05518550 0.1904829088 0.001650215 0.197389362
## White.Meat 0.40736071 0.02416065 0.1736673957 0.302620310 0.006659065
## Egg 0.72933547 0.01180330 0.0005489233 0.025273477 0.050706246
## Milk 0.61865187 0.09491560 0.0523575399 0.107100842 0.017559128
## Fish 0.06678156 0.57304473 0.1426086263 0.096543246 0.028873053
## Cereals 0.72066906 0.17069483 0.0007256581 0.011461032 0.059607823
# Contributions to the principal components
head(var$contrib)
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## Red.Meat 10.117293 3.1716922 14.54869623 0.1581346 28.2373005
## White.Meat 9.863295 1.3885919 13.26436162 28.9990836 0.9526046
## Egg 17.659167 0.6783745 0.04192564 2.4218721 7.2537218
## Milk 14.979220 5.4551123 3.99896215 10.2631125 2.5119001
## Fish 1.616961 32.9347677 10.89215614 9.2514136 4.1304003
## Cereals 17.449330 9.8103941 0.05542429 1.0982721 8.5271262
# Correlation circle
fviz_pca_var(res.pca, col.var = "black")

# Quality of representation
corrplot(var$cos2, is.corr=FALSE)

# Total cos2 of variables on Dim.1 and Dim.2
# A high cos2 indicates a good representation of the variable on the principal component,
# -> In this case the variable is positioned close to the circumference of the correlation circle.
# A low cos2 indicates that the variable is not perfectly represented by the PCs,
# -> In this case the variable is close to the center of the circle.
fviz_cos2(res.pca, choice = "var", axes = 1:2)

fviz_pca_var(res.pca, col.var = "cos2",
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE # Avoid text overlapping
)

# Change the transparency by cos2 values
fviz_pca_var(res.pca, alpha.var = "cos2")

corrplot(var$contrib, is.corr=FALSE)

# Contributions of variables to PC1
fviz_contrib(res.pca, choice = "var", axes = 1, top = 10)

# Contributions of variables to PC2
fviz_contrib(res.pca, choice = "var", axes = 2, top = 10)

fviz_pca_var(res.pca, col.var = "contrib",
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07")
)

fviz_pca_var(res.pca, alpha.var = "contrib")

# =============================================================================================
# Question 2 - Carry out cluster analysis to study relation between countries on their diet
# K-means Clustering
# Standardizing the data with scale()
matstd.Protein_Consumption <- scale(Protein_Consumption[-1])
matstd.Protein_Consumption
## Red.Meat White.Meat Egg Milk Fish Cereals
## [1,] 0.05876425 -1.84988830 -1.86538958 -1.16658295 -1.23330478 0.8791769
## [2,] -0.23505701 1.62533538 0.82507616 0.38322532 -0.65699414 -0.3923599
## [3,] 1.23404931 0.28871089 0.82507616 0.10144200 0.20747183 -0.4831840
## [4,] -0.52887828 -0.51326380 -0.96856767 -1.30747461 -0.94514946 2.2415378
## [5,] 0.05876425 0.82336069 -0.07174575 -0.60301630 -0.65699414 0.1525844
## [6,] 0.35258552 0.82336069 0.82507616 1.08768362 1.64824845 -0.9373043
## [7,] -0.52887828 1.09068558 0.82507616 -0.88479963 0.20747183 -0.6648321
## [8,] 0.05876425 -0.78058870 -0.07174575 2.35570856 0.49562716 -0.5740081
## [9,] 2.40933437 0.55603579 -0.07174575 0.38322532 0.49562716 -0.3923599
## [10,] 0.05876425 -1.31523850 -0.07174575 0.10144200 0.49562716 0.8791769
## [11,] -1.41034207 1.09068558 -0.07174575 -1.02569129 -1.23330478 0.6975288
## [12,] 1.23404931 0.55603579 1.72189807 1.22857528 -0.65699414 -0.7556562
## [13,] -0.23505701 -0.78058870 -0.07174575 -0.46212464 -0.36883881 0.4250566
## [14,] 0.05876425 1.62533538 0.82507616 0.80590030 -0.36883881 -0.9373043
## [15,] -0.23505701 -0.78058870 -0.07174575 0.80590030 1.64824845 -0.8464803
## [16,] -0.82269954 0.55603579 -0.07174575 0.24233366 -0.36883881 0.3342325
## [17,] -1.11652080 -1.04791360 -1.86538958 -1.73014959 2.80086974 -0.4831840
## [18,] -1.11652080 -0.51326380 -0.96856767 -0.88479963 -0.94514946 1.6057694
## [19,] -0.82269954 -1.31523850 -0.07174575 -1.16658295 0.78378248 -0.3015359
## [20,] 0.05876425 0.02138599 0.82507616 1.08768362 1.07193780 -1.1189524
## [21,] 0.94022805 0.55603579 -0.07174575 0.94679196 -0.65699414 -0.5740081
## [22,] 2.11551310 -0.51326380 1.72189807 0.52411698 -0.08068349 -0.7556562
## [23,] -0.23505701 -0.78058870 -0.96856767 -0.03944966 -0.36883881 1.0608250
## [24,] 0.35258552 1.35801048 0.82507616 0.24233366 -0.36883881 -1.2097765
## [25,] -1.70416333 -0.78058870 -1.86538958 -1.02569129 -0.94514946 2.1507138
## Starchy.Foods Pulses.Nuts.and.Oilseeds Fruits.and.Vegetables Total
## [1,] -2.0298502 1.44620630 -1.1489125 -2.11574280
## [2,] -0.2174840 -1.03017435 -0.1044466 -0.04727917
## [3,] 0.9907602 -0.53489822 -0.1044466 0.39596304
## [4,] -2.0298502 0.45565404 -0.1044466 0.69145784
## [5,] 0.3866381 -1.03017435 -0.1044466 -0.49052138
## [6,] 0.3866381 -1.03017435 -1.1489125 0.69145784
## [7,] 1.5948823 -1.03017435 -0.1044466 -1.37700579
## [8,] 0.3866381 -1.03017435 -1.6711455 0.69145784
## [9,] 0.3866381 -0.53489822 1.4622523 1.87343706
## [10,] -1.4257281 2.43675857 1.4622523 1.87343706
## [11,] -0.2174840 0.95093017 -0.1044466 -0.49052138
## [12,] 0.9907602 -0.53489822 -0.6266796 0.83920525
## [13,] -1.4257281 0.45565404 1.4622523 -0.34277397
## [14,] -0.2174840 -0.53489822 -0.1044466 -0.04727917
## [15,] 0.3866381 -0.53489822 -0.6266796 -0.49052138
## [16,] 0.9907602 -0.53489822 1.4622523 0.98695265
## [17,] 0.9907602 0.95093017 1.9844853 -1.52475319
## [18,] -0.8216060 0.95093017 -0.6266796 0.10046823
## [19,] 0.9907602 1.44620630 1.4622523 -1.37700579
## [20,] -0.2174840 -1.03017435 -1.1489125 -0.63826878
## [21,] -0.8216060 -0.53489822 0.4177864 0.24821564
## [22,] 0.3866381 -0.03962209 -0.6266796 0.24821564
## [23,] 0.9907602 -0.03962209 -0.6266796 0.83920525
## [24,] 0.3866381 -0.53489822 -0.1044466 -0.93376358
## [25,] -0.8216060 1.44620630 -0.6266796 0.39596304
## attr(,"scaled:center")
## Red.Meat White.Meat Egg
## 9.80 7.92 3.08
## Milk Fish Cereals
## 17.28 4.28 32.32
## Starchy.Foods Pulses.Nuts.and.Oilseeds Fruits.and.Vegetables
## 4.36 3.08 4.20
## Total
## 86.32
## attr(,"scaled:scale")
## Red.Meat White.Meat Egg
## 3.403430 3.740766 1.115049
## Milk Fish Cereals
## 7.097652 3.470351 11.010298
## Starchy.Foods Pulses.Nuts.and.Oilseeds Fruits.and.Vegetables
## 1.655295 2.019076 1.914854
## Total
## 6.768309
# kmeans() is the function for non-hierarchical method.
rownames(matstd.Protein_Consumption) <- Protein_Consumption$Country
# For 2 clusters, k-means = 2
kmeans2.Protein_Consumption <- kmeans(matstd.Protein_Consumption,2,nstart = 10)
# Computing the percentage of variation accounted for two clusters
perc.var.2 <- round(100*(1 - kmeans2.Protein_Consumption$betweenss/kmeans2.Protein_Consumption$totss),1)
names(perc.var.2) <- "Perc. 2 clus"
perc.var.2
## Perc. 2 clus
## 64.7
# For 3 clusters, k-means = 3
kmeans3.Protein_Consumption <- kmeans(matstd.Protein_Consumption,3,nstart = 10)
# Computing the percentage of variation accounted for three clusters
perc.var.3 <- round(100*(1 - kmeans3.Protein_Consumption$betweenss/kmeans3.Protein_Consumption$totss),1)
names(perc.var.3) <- "Perc. 3 clus"
perc.var.3
## Perc. 3 clus
## 51.9
# For 4 clusters, k-means = 4
kmeans4.Protein_Consumption <- kmeans(matstd.Protein_Consumption,4,nstart = 10)
# Computing the percentage of variation accounted for four clusters
perc.var.4 <- round(100*(1 - kmeans4.Protein_Consumption$betweenss/kmeans4.Protein_Consumption$totss),1)
names(perc.var.4) <- "Perc. 4 clus"
perc.var.4
## Perc. 4 clus
## 44
# We divide the dataset into two clusters.
# Filtering properties which are in 1 cluster of k mean 2
clus1 <- matrix(names(kmeans2.Protein_Consumption$cluster[kmeans2.Protein_Consumption$cluster == 1]),
ncol=1, nrow=length(kmeans2.Protein_Consumption$cluster[kmeans2.Protein_Consumption$cluster == 1]))
colnames(clus1) <- "Cluster 1"
clus1
## Cluster 1
## [1,] "Albania"
## [2,] "Bulgaria"
## [3,] "Greece"
## [4,] "Hungary"
## [5,] "Italy"
## [6,] "Portugal"
## [7,] "Romania"
## [8,] "Spain"
## [9,] "USSR"
## [10,] "Yugoslavia"
# Filtering properties which are in 2 cluster of k mean 2
clus2 <- matrix(names(kmeans2.Protein_Consumption$cluster[kmeans2.Protein_Consumption$cluster == 2]),
ncol=1, nrow=length(kmeans2.Protein_Consumption$cluster[kmeans2.Protein_Consumption$cluster == 2]))
colnames(clus2) <- "Cluster 2"
clus2
## Cluster 2
## [1,] "Austria"
## [2,] "Belgium"
## [3,] "Czechoslovakia"
## [4,] "Denmark"
## [5,] "East Germany"
## [6,] "Finland"
## [7,] "France"
## [8,] "Ireland"
## [9,] "Netherlands"
## [10,] "Norway"
## [11,] "Poland"
## [12,] "Sweden"
## [13,] "Switzerland"
## [14,] "United Kingdom"
## [15,] "West Germany"
# We can use the hierarchical clustering method, which allows us to create a dendrogram
# to visualize the relationships between the countries based on their protein consumption from different food sources.
# Hierarchical Clustering
# Calculating the distance matrix using Euclidean distance
dist.Protein_Consumption <- dist(Protein_Consumption, method = "euclidean") # Distance matrix
## Warning in dist(Protein_Consumption, method = "euclidean"): NAs introduced by
## coercion
# Performing hierarchical clustering using complete linkage
fit <- hclust(dist.Protein_Consumption, method="complete")
#Plotting the dendogram
plot(fit, main="Dendrogram of European countries based on diet")
#Cutting the tree into 2 clusters
groups <- cutree(fit, k=2)
#Plotting dendogram with red borders around the 2 clusters
rect.hclust(fit, k=2, border="red")

# From the dendrogram, we can see that the countries can be grouped into two main clusters:
# one containing mostly Northern and Western European countries, and another containing mostly
# Southern and Eastern European countries.
# The countries in the second cluster (Northern and Western Europe) generally consume more meat,
# fish, milk, and cereals, while the countries in the first cluster (Southern and Eastern Europe)
# consume more fruits, vegetables, and pulses/nuts/oilseeds.
# Overall, the hierarchical clustering analysis confirms some of the dietary patterns and
# differences that we observed in the earlier principal components analysis.
# Clustering is a method of grouping together a set of objects together in such a way
# that the objects in one cluster is similar to the objects in the same cluster than
# objects present in the different cluster.
# We form 2 clusters for the given dataset as seen in the dendogram.
# This covers all the variance present in the dataset.
# ??????????????????????????
library(cluster)
library(factoextra)
library(magrittr)
library(NbClust)
# We will use agnes function as it allows us to select option for data standardization, the distance measure and clustering algorithm in one single function
(dist.Protein_Consumption <- agnes(Protein_Consumption, metric="euclidean", stand=TRUE, method = "single"))
## Call: agnes(x = Protein_Consumption, metric = "euclidean", stand = TRUE, method = "single")
## Agglomerative coefficient: 0.375437
## Order of objects:
## [1] 1 2 14 24 5 21 7 3 12 22 6 15 20 8 4 11 18 25 23 9 16 13 10 17 19
## Height (summary):
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.107 2.739 3.162 3.174 3.700 4.831
##
## Available components:
## [1] "order" "height" "ac" "merge" "diss" "call" "method" "data"
#View(dist.Protein_Consumption)
# Description of cluster merging
dist.Protein_Consumption$merge
## [,1] [,2]
## [1,] -18 -25
## [2,] -2 -14
## [3,] 2 -24
## [4,] -15 -20
## [5,] 3 -5
## [6,] 5 -21
## [7,] -3 -12
## [8,] 6 -7
## [9,] 7 -22
## [10,] -11 1
## [11,] 8 9
## [12,] -6 4
## [13,] 11 12
## [14,] -4 10
## [15,] 13 -8
## [16,] 14 -23
## [17,] 15 16
## [18,] 17 -9
## [19,] -17 -19
## [20,] 18 -16
## [21,] 20 -13
## [22,] 21 -10
## [23,] 22 19
## [24,] -1 23
# Dendogram
plot(as.dendrogram(dist.Protein_Consumption), xlab= "Distance",xlim=c(8,0),
horiz = TRUE,main="Dendrogram")

#Interactive Plots
#plot(agn.employ,ask=TRUE)
plot(dist.Protein_Consumption, which.plots=1)

plot(dist.Protein_Consumption, which.plots=2)

plot(dist.Protein_Consumption, which.plots=3)
# =============================================================================================
# Question 3 - Identify the important factors underlying the observed variables
# and examine the relationships between the countries with respect to these factors
# Computing Correlation Matrix
corrm.Protein_Consumption <- cor(Protein_Consumption[c(-1,-11)])
corrm.Protein_Consumption
## Red.Meat White.Meat Egg Milk
## Red.Meat 1.00000000 0.18850977 0.57532001 0.5440251
## White.Meat 0.18850977 1.00000000 0.60095535 0.2974816
## Egg 0.57532001 0.60095535 1.00000000 0.6130310
## Milk 0.54402512 0.29748163 0.61303102 1.0000000
## Fish 0.06491072 -0.19719960 0.04780844 0.1624624
## Cereals -0.50970337 -0.43941908 -0.70131040 -0.5924925
## Starchy.Foods 0.15383673 0.33456770 0.41266333 0.2144917
## Pulses.Nuts.and.Oilseeds -0.40988882 -0.67214885 -0.59519381 -0.6238357
## Fruits.and.Vegetables -0.06393465 -0.07329308 -0.16392249 -0.3997753
## Fish Cereals Starchy.Foods
## Red.Meat 0.06491072 -0.50970337 0.1538367
## White.Meat -0.19719960 -0.43941908 0.3345677
## Egg 0.04780844 -0.70131040 0.4126633
## Milk 0.16246239 -0.59249246 0.2144917
## Fish 1.00000000 -0.51714759 0.4386841
## Cereals -0.51714759 1.00000000 -0.5781345
## Starchy.Foods 0.43868411 -0.57813449 1.0000000
## Pulses.Nuts.and.Oilseeds -0.12226043 0.63605948 -0.4951880
## Fruits.and.Vegetables 0.22948842 0.04229293 0.0683567
## Pulses.Nuts.and.Oilseeds Fruits.and.Vegetables
## Red.Meat -0.4098888 -0.06393465
## White.Meat -0.6721488 -0.07329308
## Egg -0.5951938 -0.16392249
## Milk -0.6238357 -0.39977527
## Fish -0.1222604 0.22948842
## Cereals 0.6360595 0.04229293
## Starchy.Foods -0.4951880 0.06835670
## Pulses.Nuts.and.Oilseeds 1.0000000 0.35133227
## Fruits.and.Vegetables 0.3513323 1.00000000
corrplot(corrm.Protein_Consumption,method="number")

fit.bfc <- principal(Protein_Consumption[c(-1,-11)], nfactors=4, rotate="varimax")
fit.bfc
## Principal Components Analysis
## Call: principal(r = Protein_Consumption[c(-1, -11)], nfactors = 4,
## rotate = "varimax")
## Standardized loadings (pattern matrix) based upon correlation matrix
## RC3 RC1 RC2 RC4 h2 u2 com
## Red.Meat 0.08 0.92 0.01 0.02 0.86 0.138 1.0
## White.Meat 0.94 0.14 -0.08 -0.01 0.91 0.086 1.1
## Egg 0.59 0.66 0.13 -0.09 0.81 0.193 2.1
## Milk 0.20 0.68 0.21 -0.51 0.81 0.188 2.3
## Fish -0.21 0.10 0.92 0.09 0.91 0.089 1.2
## Cereals -0.42 -0.56 -0.61 0.07 0.87 0.133 2.8
## Starchy.Foods 0.52 0.01 0.71 0.03 0.77 0.226 1.8
## Pulses.Nuts.and.Oilseeds -0.69 -0.34 -0.28 0.41 0.83 0.166 2.6
## Fruits.and.Vegetables -0.05 -0.04 0.14 0.95 0.93 0.071 1.1
##
## RC3 RC1 RC2 RC4
## SS loadings 2.25 2.21 1.89 1.36
## Proportion Var 0.25 0.25 0.21 0.15
## Cumulative Var 0.25 0.50 0.71 0.86
## Proportion Explained 0.29 0.29 0.25 0.18
## Cumulative Proportion 0.29 0.58 0.82 1.00
##
## Mean item complexity = 1.8
## Test of the hypothesis that 4 components are sufficient.
##
## The root mean square of the residuals (RMSR) is 0.05
## with the empirical chi square 4.94 with prob < 0.55
##
## Fit based upon off diagonal values = 0.98
# Note that here, we are excluding/ignoring the column 'total', because it is just having the summation of all the column values
# We are considering 4 factors, because of VSS (Very Simple Structure)
round(fit.bfc$values, 3)
## [1] 4.096 1.625 1.085 0.905 0.427 0.347 0.270 0.135 0.111
# Loadings
fit.bfc$loadings
##
## Loadings:
## RC3 RC1 RC2 RC4
## Red.Meat 0.925
## White.Meat 0.941 0.142
## Egg 0.594 0.655 0.128
## Milk 0.197 0.684 0.208 -0.513
## Fish -0.214 0.921
## Cereals -0.418 -0.557 -0.614
## Starchy.Foods 0.518 0.711
## Pulses.Nuts.and.Oilseeds -0.688 -0.337 -0.277 0.413
## Fruits.and.Vegetables 0.144 0.951
##
## RC3 RC1 RC2 RC4
## SS loadings 2.249 2.207 1.895 1.361
## Proportion Var 0.250 0.245 0.211 0.151
## Cumulative Var 0.250 0.495 0.706 0.857
# Factor Loadings: This table displays the factor loadings for each variable.
# It represents the correlation between each variable and each factor.
# Communalities
fit.bfc$communality
## Red.Meat White.Meat Egg
## 0.8623000 0.9136165 0.8067014
## Milk Fish Cereals
## 0.8123492 0.9110157 0.8666324
## Starchy.Foods Pulses.Nuts.and.Oilseeds Fruits.and.Vegetables
## 0.7744128 0.8343631 0.9293892
# Communalities are estimates of the variance in each observed variable that can be
# explained by the extracted factors.
# It shows the proportion of each variable's variance that can be explained by the
# factors, e.g., Red Meat explains 0.862, White Meat explains 0.913, and so on.
# Rotated factor scores
head(fit.bfc$scores)
## RC3 RC1 RC2 RC4
## [1,] -1.9894396 -0.15885917 -1.3822473 -0.74679784
## [2,] 1.4868236 -0.02157573 -0.6102314 -0.11610160
## [3,] 0.4131939 0.75269512 0.3257944 0.20157345
## [4,] -0.6801210 -0.67812717 -1.7099907 0.01919653
## [5,] 1.0805053 -0.48828520 -0.3651871 0.01149907
## [6,] 0.2912014 0.38648634 1.1836494 -1.20111092
round(fit.bfc$values,3)
## [1] 4.096 1.625 1.085 0.905 0.427 0.347 0.270 0.135 0.111
# Factor recommendation
fa.parallel(Protein_Consumption[c(-1,-11)])

## Parallel analysis suggests that the number of factors = 1 and the number of components = 1
# From this, we can inference that 4th component could be the best choice for number of factors
# Correlations within Factors
fa.plot(fit.bfc)

# Visualizing the relationship
fa.diagram(fit.bfc)

# Factor recommendations for a simple structure
vss(Protein_Consumption[c(-1,-11)])
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected. Examine the results carefully
##
## Very Simple Structure
## Call: vss(x = Protein_Consumption[c(-1, -11)])
## VSS complexity 1 achieves a maximimum of 0.76 with 1 factors
## VSS complexity 2 achieves a maximimum of 0.87 with 2 factors
##
## The Velicer MAP achieves a minimum of 0.08 with 1 factors
## BIC achieves a minimum of -40 with 1 factors
## Sample Size adjusted BIC achieves a minimum of 1.3 with 5 factors
##
## Statistics by number of factors
## vss1 vss2 map dof chisq prob sqresid fit RMSEA BIC SABIC complex
## 1 0.76 0.00 0.084 27 4.7e+01 0.01 5.31 0.76 0.167 -40.0 43.7 1.0
## 2 0.72 0.87 0.098 19 2.7e+01 0.11 2.90 0.87 0.121 -34.4 24.6 1.3
## 3 0.49 0.84 0.110 12 1.4e+01 0.27 1.71 0.92 0.081 -24.1 13.1 1.7
## 4 0.51 0.82 0.144 6 5.4e+00 0.49 0.78 0.96 0.000 -13.9 4.7 1.8
## 5 0.43 0.75 0.208 1 1.4e+00 0.23 0.67 0.97 0.123 -1.8 1.3 2.0
## 6 0.52 0.74 0.301 -3 4.9e-08 NA 0.45 0.98 NA NA NA 2.2
## 7 0.46 0.74 0.496 -6 4.7e-08 NA 0.42 0.98 NA NA NA 2.3
## 8 0.46 0.75 1.000 -8 0.0e+00 NA 0.40 0.98 NA NA NA 2.3
## eChisq SRMR eCRMS eBIC
## 1 3.9e+01 1.5e-01 0.169 -48
## 2 1.5e+01 9.1e-02 0.125 -46
## 3 5.3e+00 5.4e-02 0.094 -33
## 4 8.7e-01 2.2e-02 0.054 -18
## 5 2.4e-01 1.2e-02 0.070 -3
## 6 5.5e-09 1.8e-06 NA NA
## 7 3.8e-09 1.5e-06 NA NA
## 8 1.7e-19 9.8e-12 NA NA
summary(vss(Protein_Consumption[c(-1,-11)]) )
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## An ultra-Heywood case was detected. Examine the results carefully

##
## Very Simple Structure
## VSS complexity 1 achieves a maximimum of 0.76 with 1 factors
## VSS complexity 2 achieves a maximimum of 0.87 with 2 factors
##
## The Velicer MAP criterion achieves a minimum of 1 with 1 factors
##
# Overall, the factor analysis suggests that there are four underlying factors that
# explain most of the variance in the dataset. These factors represent different aspects
# of the European diet, such as a diet high in Milk, Egg and Fish (factor 1), a diet high
# in fish, starchy food and cereals (factor 2), a diet high in white meat, pulses/nuts/oilseeds
# (factor 3), and a diet high in fruits and vegetables (factor 4).
# The factor scores for each country provide a way to compare countries based on their diet patterns and
# identify similarities and differences between them.
# Factor Analysis is the method of identifying the latent relational structure among
# a set of variables and then narrowing it down to a smaller number of variables.
# We can see that we have reduced the factors to five, this contains most of the
# information in the dataset.
# The factors RC1, RC2, RC3, RC4 can be helpful in analyzing the entire dataset.